In [288]:
# Regression. Numeric and Categorical Predictors. Dummy Variables and Interactions.
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Locate and load the data. Reading through an explicit path avoids os.chdir(),
# whose working-directory side effect would silently change every later
# relative file access in the session.
DATA_DIR = "C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data"
Auto = pd.read_csv(os.path.join(DATA_DIR, "Auto.csv"))  # Read the data file in the CSV format
In [290]:
# Prepare X and Y, fit a simple linear regression of MPG on weight, and plot
# the fitted line with the data.
# NOTE: Auto already is a DataFrame (pd.read_csv returns one), so the former
# pd.DataFrame(Auto) re-wrap was redundant and has been removed.

Weight = Auto['weight']           # predictor
MPG    = Auto['mpg']              # response
X      = sm.add_constant(Weight)  # design matrix: intercept column + weight

reg = sm.OLS(MPG, X).fit()
print(reg.summary())

plt.scatter(Weight, MPG, label='Data', s=15)
plt.plot(Weight, reg.predict(X), color='red', label='Regression Line')
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon'); plt.title('Linear regression line'); 
plt.legend(); plt.show()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.692
Model:                            OLS   Adj. R-squared:                  0.691
Method:                 Least Squares   F-statistic:                     886.6
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          5.37e-103
Time:                        10:29:59   Log-Likelihood:                -1146.0
No. Observations:                 397   AIC:                             2296.
Df Residuals:                     395   BIC:                             2304.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         46.3174      0.796     58.166      0.000      44.752      47.883
weight        -0.0077      0.000    -29.776      0.000      -0.008      -0.007
==============================================================================
Omnibus:                       40.133   Durbin-Watson:                   0.797
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               56.057
Skew:                           0.712   Prob(JB):                     6.72e-13
Kurtosis:                       4.166   Cond. No.                     1.13e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [292]:
# Is it the same linear relationship for American, Asian, and European cars?
# Map each origin code to a plot color and redraw weight vs mpg, colored by origin.
ORIGIN_COLORS = {1: 'orange', 2: 'blue', 3: 'green'}
Auto['color'] = Auto['origin'].map(ORIGIN_COLORS)

plt.scatter(Auto['weight'], Auto['mpg'], c=Auto['color'])
plt.xlabel('Weight'); plt.ylabel('MPG')
plt.title('Weight vs MPG colored by Continent')
plt.show()
No description has been provided for this image
In [294]:
# The relationship appears to flatten (the line becomes less steep) for heavier cars, suggesting the slope may differ across groups.
In [320]:
# Build dummy (indicator) variables for origin, allowing different INTERCEPTS.
# drop_first=True drops the origin==1 column, making origin 1 the baseline level.
# NOTE: the former second pd.get_dummies() call on the already-encoded frame was
# a no-op and has been removed.
Dummies = pd.get_dummies(Auto['origin'], dtype=int, drop_first=True)
Dummies = Dummies.rename(columns={2: 'origin2', 3: 'origin3'})
X = sm.add_constant(Dummies)
In [322]:
# Regress MPG on the origin dummies alone: each origin gets its own intercept,
# with no weight term yet.
dummy_only_model = sm.OLS(MPG, X)
reg_dummies = dummy_only_model.fit()
print(reg_dummies.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.333
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     98.45
Date:                Wed, 31 Jul 2024   Prob (F-statistic):           2.12e-35
Time:                        10:40:18   Log-Likelihood:                -1299.2
No. Observations:                 397   AIC:                             2604.
Df Residuals:                     394   BIC:                             2616.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         20.0718      0.407     49.339      0.000      19.272      20.872
origin2        7.8197      0.867      9.018      0.000       6.115       9.524
origin3       10.3789      0.828     12.540      0.000       8.752      12.006
==============================================================================
Omnibus:                       25.088   Durbin-Watson:                   0.753
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               28.611
Skew:                           0.657   Prob(JB):                     6.13e-07
Kurtosis:                       3.020   Cond. No.                         3.16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [324]:
# Plot the fitted values of the intercepts-only model over the data,
# using the same per-origin colors as before.
fig, ax = plt.subplots()
ax.scatter(Weight, MPG, c=Auto['color'], s=20)
ax.scatter(Weight, reg_dummies.predict(X), c=Auto['color'])
ax.set_xlabel('Automobile weight')
ax.set_ylabel('Miles per gallon')
ax.set_title('Regression with different intercepts and no slopes')
plt.show()
No description has been provided for this image
In [342]:
# Include a common slope: one weight coefficient shared by all origins,
# with origin-specific intercepts coming from the dummies already in X.
X1 = X.copy()
X1['weight'] = Weight

reg_int_oneslope = sm.OLS(MPG, X1).fit()
print(reg_int_oneslope.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.702
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     307.9
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          8.82e-103
Time:                        10:47:43   Log-Likelihood:                -1139.6
No. Observations:                 397   AIC:                             2287.
Df Residuals:                     393   BIC:                             2303.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         43.6896      1.107     39.481      0.000      41.514      45.865
origin2        1.2190      0.654      1.865      0.063      -0.066       2.504
origin3        2.3592      0.663      3.556      0.000       1.055       3.663
weight        -0.0070      0.000    -22.021      0.000      -0.008      -0.006
==============================================================================
Omnibus:                       37.597   Durbin-Watson:                   0.813
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               54.086
Skew:                           0.662   Prob(JB):                     1.80e-12
Kurtosis:                       4.232   Cond. No.                     1.82e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.82e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [346]:
# This regression has a slightly better R^2 and adjusted R^2 than the original
# weight-only model; the grouping and the slope are significant. Plot:
fig, ax = plt.subplots()
ax.scatter(Weight, MPG, c=Auto['color'], s=20)
ax.scatter(Weight, reg_int_oneslope.predict(X1), c=Auto['color'])
ax.set_xlabel('Automobile weight')
ax.set_ylabel('Miles per gallon')
ax.set_title('Regression with different intercepts and one common slope')
plt.show()
No description has been provided for this image
In [352]:
# Fit a regression with interaction terms origin_k * weight, letting each
# origin have its own slope in addition to its own intercept.
X1 = X1.assign(
    origin2_weight=X1['origin2'] * X1['weight'],
    origin3_weight=X1['origin3'] * X1['weight'],
)

reg_interactions = sm.OLS(MPG, X1).fit()
reg_interactions.summary()
Out[352]:
OLS Regression Results
Dep. Variable: mpg R-squared: 0.706
Model: OLS Adj. R-squared: 0.703
Method: Least Squares F-statistic: 188.1
Date: Wed, 31 Jul 2024 Prob (F-statistic): 1.14e-101
Time: 10:54:14 Log-Likelihood: -1136.4
No. Observations: 397 AIC: 2285.
Df Residuals: 391 BIC: 2309.
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 42.9846 1.179 36.465 0.000 40.667 45.302
origin2 2.3912 2.847 0.840 0.401 -3.206 7.988
origin3 11.2755 3.583 3.147 0.002 4.231 18.320
weight -0.0068 0.000 -19.973 0.000 -0.007 -0.006
origin2_weight -0.0004 0.001 -0.365 0.715 -0.003 0.002
origin3_weight -0.0039 0.002 -2.527 0.012 -0.007 -0.001
Omnibus: 42.084 Durbin-Watson: 0.819
Prob(Omnibus): 0.000 Jarque-Bera (JB): 61.346
Skew: 0.720 Prob(JB): 4.78e-14
Kurtosis: 4.278 Cond. No. 5.36e+04


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.36e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [354]:
# The interaction model improves R^2 only marginally; per the summary above the
# origin3-weight interaction is significant while the origin2 terms are not. Plot:
fig, ax = plt.subplots()
ax.scatter(Weight, MPG, c=Auto['color'], s=20)
ax.scatter(Weight, reg_interactions.predict(X1), c=Auto['color'])
ax.set_xlabel('Automobile weight')
ax.set_ylabel('Miles per gallon')
ax.set_title('Regression with origin-weight interactions')
plt.show()
No description has been provided for this image